In [1]:
import sys
home_directory = '/dfs/scratch2/fcipollone'
sys.path.append(home_directory)
import numpy as np
from nbminer.notebook_miner import NotebookMiner
# Load the saved per-student homework notebook paths and parse the first
# 59 notebooks of each assignment with NotebookMiner.
hw_filenames = np.load('../homework_names_jplag_combined_per_student.npy')
hw_notebooks = [[NotebookMiner(filename) for filename in temp[:59]] for temp in hw_filenames]
In [2]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
#a = Features(hw_notebooks[0], 'hw0')
#a.add_notebooks(hw_notebooks[1], 'hw1')
a = Features(hw_notebooks[2], 'hw2')
a.add_notebooks(hw_notebooks[3], 'hw3')
a.add_notebooks(hw_notebooks[4], 'hw4')
a.add_notebooks(hw_notebooks[5], 'hw5')
# Assemble the feature-extraction pipeline and run it over the corpus.
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
agr = ASTGraphReducer(a, threshold=8, split_call=False)
ci = CorpusIdentifier()
fi = FrequentItemsets()
pipe = Pipeline([gastf, rbn, gi, agr, ci, fi])
a = pipe.transform(a)
In [3]:
# Group each frequent-itemset bucket's patterns by the notebook it came from.
notebook_patterns = {}
for bucket in fi.buckets:
    name = None
    for cell in bucket.items:
        name = cell.get_feature('notebook_name')
    if name not in notebook_patterns:
        notebook_patterns[name] = []
    notebook_patterns[name].append(bucket.get_patterns())
In [4]:
# Flatten each notebook's pattern lists into a single set of itemsets.
notebook_itemsets = {}
for key in notebook_patterns:
    itemsets = []
    for patterns in notebook_patterns[key]:
        itemsets.extend(patterns)
    notebook_itemsets[key] = set(itemsets)
In [5]:
keys = list(notebook_itemsets)
print(len(notebook_itemsets[keys[4]]))
In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist([len(notebook_itemsets[key]) for key in notebook_itemsets.keys()])
Out[6]:
In [7]:
total_set = []
for key in notebook_itemsets:
    total_set.extend(notebook_itemsets[key])
print(len(set(total_set)))
In [8]:
# Map each notebook name to its template features (X) and homework label (y).
X, y, names = ci.get_data_set_full()
notebook_templates = {}
notebook_labels = {}
for i, name in enumerate(names):
    notebook_templates[name] = X[i]
    notebook_labels[name] = y[i]
In [9]:
print(len(notebook_labels.keys()))
print(len(notebook_itemsets.keys()))
print(len(notebook_templates.keys()))
In [10]:
# Notebooks whose name could not be recovered end up under the None key.
notebook_itemsets[None]
Out[10]:
In [11]:
# Combine itemset features and template features per notebook, skipping
# notebooks with no recorded name (the None bucket inspected above) or no
# template entry. Note: the original guard tested `key in notebook_itemsets`,
# which is always true inside this loop; `notebook_templates` is the lookup
# that actually needs protecting.
X = []
y = []
for key in notebook_itemsets:
    if key in notebook_templates and key is not None:
        itemset_component = ['itemset_' + '_'.join(el) for el in notebook_itemsets[key]]
        template_component = notebook_templates[key]
        X.append(itemset_component + template_component)
        y.append(notebook_labels[key])
In [12]:
import tqdm

# Pairwise Jaccard similarity between the notebooks' combined feature sets.
# Building each set once avoids recomputing set(X[i]) in the inner loop.
feature_sets = [set(el) for el in X]
similarities = np.zeros((len(X), len(X)))
for i in tqdm.tqdm(range(len(X))):
    for j in range(len(X)):
        union = feature_sets[i] | feature_sets[j]
        if len(union) == 0:
            continue
        similarities[i][j] = len(feature_sets[i] & feature_sets[j]) / len(union)
def get_avg_inter_intra_sims(X, y, val):
    # Intra: pairs where both notebooks come from homework `val`.
    # Inter: every other pair, including pairs drawn from other homeworks.
    inter_sims = []
    intra_sims = []
    for i in range(len(X)):
        for j in range(i + 1, len(X)):
            if y[i] == y[j] and y[i] == val:
                intra_sims.append(similarities[i][j])
            else:
                inter_sims.append(similarities[i][j])
    return np.array(intra_sims), np.array(inter_sims)
for i in np.unique(y):
    intra_sims, inter_sims = get_avg_inter_intra_sims(X, y, i)
    print('Mean intra similarity for hw', i, 'is', np.mean(intra_sims), 'with std', np.std(intra_sims))
    print('Mean inter similarity for hw', i, 'is', np.mean(inter_sims), 'with std', np.std(inter_sims))
    print('----')
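In [ ]:
# Sanity check of the similarity measure used above (toy sets, not corpus
# data): the matrix entries are plain Jaccard similarities, |A & B| / |A | B|.
toy_a = {'itemset_a_b', 'template_1', 'template_2'}
toy_b = {'itemset_a_b', 'template_2', 'template_3'}
print(len(toy_a & toy_b) / len(toy_a | toy_b))  # 2 shared of 4 total -> 0.5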
In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 5, 10

def get_all_sims(X, y, val):
    # All pairwise similarities involving at least one notebook from homework `val`.
    sims = []
    for i in range(len(X)):
        for j in range(i + 1, len(X)):
            if y[i] == val or y[j] == val:
                sims.append(similarities[i][j])
    return sims

fig, axes = plt.subplots(6)
for i in range(6):
    axes[i].hist(get_all_sims(X, y, i), bins=30)
In [14]:
tot = []
for el in X:
    tot.extend(el)
print(len(set(tot)))
In [15]:
print(X[0])
In [16]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Bag-of-words encoding: join each notebook's feature tokens into one string
# and let CountVectorizer build the count matrix. (The explicit submodule
# import is needed; `import sklearn` alone does not expose
# sklearn.feature_extraction.text.)
countvec = CountVectorizer()
X_list = [" ".join(el) for el in X]
countvec.fit(X_list)
X = countvec.transform(X_list)
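In [ ]:
# Illustration only (toy strings, not the homework corpus): CountVectorizer
# learns one column per distinct token seen in fit() and counts occurrences
# per document, which is how the count matrix X above is built.
from sklearn.feature_extraction.text import CountVectorizer
toy_vec = CountVectorizer().fit(["itemset_a template_1", "itemset_a itemset_b"])
print(sorted(toy_vec.vocabulary_))  # ['itemset_a', 'itemset_b', 'template_1']
print(toy_vec.transform(["itemset_a itemset_a"]).toarray())  # [[2 0 0]]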
In [17]:
X.shape
Out[17]:
In [36]:
from sklearn.ensemble import RandomForestClassifier
# Shuffle the notebooks, drop the last 138 feature columns, and estimate
# accuracy with 10-fold cross-validation.
p = np.random.permutation(X.shape[0])
Xt = X.todense()[p][:, :-138]
yt = np.array(y)[p]
clf = RandomForestClassifier(n_estimators=400, max_depth=3)
scores = cross_val_score(clf, Xt, yt, cv=10)
print(scores)
print(np.mean(scores))
In [35]:
from sklearn.ensemble import AdaBoostClassifier
# Same evaluation with AdaBoost, this time keeping all feature columns.
p = np.random.permutation(X.shape[0])
Xt = X.todense()[p]
yt = np.array(y)[p]
clf = AdaBoostClassifier(n_estimators=400)
scores = cross_val_score(clf, Xt, yt, cv=10)
print(scores)
print(np.mean(scores))
In [33]:
# Inspect the last 138 columns of one notebook's count vector.
X[:, -138:][1].todense()
Out[33]:
In [19]:
clf.fit(X[:360], y[:360])
Out[19]:
In [20]:
clf.predict(X[300:])
In [ ]:
y[300:]
In [ ]:
# Training accuracy: the classifier is scored on the same notebooks it was fit on.
print(np.sum(clf.predict(X[:360]) == y[:360]) / len(y[:360]))